Chapter 4 MAG catalogue
4.1 filter samples with high host data
sample_metadata <- sample_metadata%>%
filter(!sample %in% c("EHI02721", "EHI02712", "EHI02700", "EHI02720", "EHI02749", "EHI02719", "EHI02729", "EHI02715", "EHI02722"))
genome_counts_filt <- genome_counts %>%
select(one_of(c("genome",sample_metadata$sample)))%>%
filter(rowSums(. != 0, na.rm = TRUE) > 0) %>%
select_if(~!all(. == 0))
genome_metadata <- genome_metadata %>%
semi_join(., genome_counts_filt, by = "genome") %>%
arrange(match(genome,genome_counts_filt$genome))
genome_tree <- keep.tip(genome_tree, tip=genome_metadata$genome) # keep only MAG tips
#load("data/genome_gifts.Rdata")4.2 Genome phylogeny
# Generate the phylum color heatmap
phylum_heatmap <- read_tsv("https://raw.githubusercontent.com/earthhologenome/EHI_taxonomy_colour/main/ehi_phylum_colors.tsv") %>%
right_join(genome_metadata, by=join_by(phylum == phylum)) %>%
arrange(match(genome, genome_tree$tip.label)) %>%
select(genome,phylum) %>%
mutate(phylum = factor(phylum, levels = unique(phylum))) %>%
column_to_rownames(var = "genome")
# Generate new species table
newspecies_table <- genome_metadata %>%
mutate(newspecies=ifelse(species=="s__","Y","N")) %>%
select(genome,newspecies) %>%
column_to_rownames(var = "genome")
# Generate wild table
wild_heatmap <- genome_counts_filt %>%
pivot_longer(!genome,names_to="sample",values_to="abundance") %>%
left_join(sample_metadata,by="sample") %>%
filter(diet=="Wild") %>%
group_by(genome) %>%
summarise(presence=ifelse(sum(abundance)>0,"present","absent")) %>%
column_to_rownames(var="genome")
# Generate captive table
captive_heatmap <- genome_counts_filt %>%
pivot_longer(!genome,names_to="sample",values_to="abundance") %>%
left_join(sample_metadata,by="sample") %>%
filter(diet=="Pre_grass") %>%
group_by(genome) %>%
summarise(presence=ifelse(sum(abundance)>0,"present","absent")) %>%
column_to_rownames(var="genome")
# Generate grass table
grass_heatmap <- genome_counts_filt %>%
pivot_longer(!genome,names_to="sample",values_to="abundance") %>%
left_join(sample_metadata,by="sample") %>%
filter(diet=="Post_grass") %>%
group_by(genome) %>%
summarise(presence=ifelse(sum(abundance)>0,"present","absent")) %>%
column_to_rownames(var="genome")
# Generate basal tree
circular_tree <- force.ultrametric(genome_tree, method="extend") %>% # extend to ultrametric for the sake of visualisation
ggtree(., layout="fan", open.angle=10, size=0.2)***************************************************************
* Note: *
* force.ultrametric does not include a formal method to *
* ultrametricize a tree & should only be used to coerce *
* a phylogeny that fails is.ultrametric due to rounding -- *
* not as a substitute for formal rate-smoothing methods. *
***************************************************************
# Add phylum ring
circular_tree <- gheatmap(circular_tree, phylum_heatmap, offset=0.05, width=0.05, colnames=FALSE) +
scale_fill_manual(values=phylum_colors) +
theme(legend.position = "none", plot.margin = margin(0, 0, 0, 0), panel.margin = margin(0, 0, 0, 0))
# Flush color scale to enable a new color scheme in the next ring
circular_tree <- circular_tree + new_scale_fill()
# Add completeness ring
circular_tree <- circular_tree +
new_scale_fill() +
scale_fill_gradient(low = "#d1f4ba", high = "#f4baba") +
geom_fruit(
data=genome_metadata,
geom=geom_bar,
mapping = aes(x=completeness, y=genome, fill=contamination),
offset = 0.3,
pwidth = 0.1,
orientation="y",
stat="identity")
# Add genome-size ring
circular_tree <- circular_tree + new_scale_fill()
circular_tree <- gheatmap(circular_tree, wild_heatmap, offset=0.3, width=0.05, colnames=FALSE) +
scale_fill_manual(values=c("#ffffff","#74C8AE")) +
theme(legend.position = "none", plot.margin = margin(0, 0, 0, 0), panel.margin = margin(0, 0, 0, 0)) +
new_scale_fill()
circular_tree <- gheatmap(circular_tree, captive_heatmap, offset=0.4, width=0.05, colnames=FALSE) +
scale_fill_manual(values=c("#ffffff","#F4B02E")) +
theme(legend.position = "none", plot.margin = margin(0, 0, 0, 0), panel.margin = margin(0, 0, 0, 0)) +
new_scale_fill()
circular_tree <- gheatmap(circular_tree, grass_heatmap, offset=0.5, width=0.05, colnames=FALSE) +
scale_fill_manual(values=c("#ffffff","#E6771D")) +
theme(legend.position = "none", plot.margin = margin(0, 0, 0, 0), panel.margin = margin(0, 0, 0, 0)) +
new_scale_fill()
circular_tree <- gheatmap(circular_tree, newspecies_table, offset=1.2, width=0.05, colnames=FALSE) +
scale_fill_manual(values=c("#f4f4f4","#666666"))
theme(legend.position = "none", plot.margin = margin(0, 0, 0, 0), panel.margin = margin(0, 0, 0, 0))List of 3
$ legend.position: chr "none"
$ plot.margin : 'margin' num [1:4] 0points 0points 0points 0points
..- attr(*, "unit")= int 8
$ panel.spacing : 'margin' num [1:4] 0points 0points 0points 0points
..- attr(*, "unit")= int 8
- attr(*, "class")= chr [1:2] "theme" "gg"
- attr(*, "complete")= logi FALSE
- attr(*, "validate")= logi TRUE
# Add text
circular_tree <- circular_tree +
annotate('text', x=2.9, y=0, label=' Phylum', family='arial', size=3.5) +
annotate('text', x=3.3, y=0, label=' Group', family='arial', size=3.5) +
annotate('text', x=3.8, y=0, label=' Genome quality', family='arial', size=3.5) +
annotate('text', x=4.2, y=0, label=' New species', family='arial', size=3.5)
#Plot circular tree
circular_tree %>% open_tree(30) %>% rotate_tree(90)4.3 Taxonomy overview
tax_mag <-genome_metadata %>%
group_by(phylum) %>%
summarise(mag_n=n())
tax_mag %>%
mutate(percetage_mag=round(mag_n*100/sum(mag_n), 2)) %>%
arrange(-percetage_mag) %>%
tt()| phylum | mag_n | percetage_mag |
|---|---|---|
| p__Bacillota_A | 526 | 61.16 |
| p__Bacteroidota | 127 | 14.77 |
| p__Bacillota | 53 | 6.16 |
| p__Pseudomonadota | 39 | 4.53 |
| p__Verrucomicrobiota | 35 | 4.07 |
| p__Synergistota | 18 | 2.09 |
| p__Actinomycetota | 15 | 1.74 |
| p__Patescibacteria | 13 | 1.51 |
| p__Desulfobacterota | 12 | 1.40 |
| p__Bacillota_C | 9 | 1.05 |
| p__Cyanobacteriota | 7 | 0.81 |
| p__Bacillota_B | 2 | 0.23 |
| p__Spirochaetota | 2 | 0.23 |
| p__Campylobacterota | 1 | 0.12 |
| p__Methanobacteriota | 1 | 0.12 |
4.4 Mag size (MB)
Mags average size (MB)
| Average_corrected_size |
|---|
| 2651482 |
Minimum Mags size (MB)
| genome | domain | phylum | class | order | family | genus | species | completeness | contamination | length | corrected_size |
|---|---|---|---|---|---|---|---|---|---|---|---|
| EHA03306_bin.234 | d__Bacteria | p__Patescibacteria | c__Saccharimonadia | o__Saccharimonadales | f__Nanosyncoccaceae | g__Nanosyncoccus | s__ | 52.57 | 7.13 | 326026 | 620175 |
Maximum Mags size (MB)
| genome | domain | phylum | class | order | family | genus | species | completeness | contamination | length | corrected_size |
|---|---|---|---|---|---|---|---|---|---|---|---|
| EHA04588_bin.47 | d__Bacteria | p__Bacillota_A | c__Clostridia | o__Oscillospirales | f__Oscillospiraceae | g__Lawsonibacter | s__ | 56.31 | 5.39 | 5576430 | 9903090 |
Mags arrange by size (MB)
Mags average size and completeness by phylum
genome_metadata %>%
group_by(phylum) %>%
summarise(average_size=mean(corrected_size),sd_size=sd(corrected_size),average_comp=mean(completeness),sd_comp=sd(completeness)) %>%
mutate(Size=str_c(round(average_size,2),"±",round(sd_size,2)),
Completeness=str_c(round(average_comp,2),"±",round(sd_comp,2))) %>%
arrange(-average_size) %>%
select(phylum, Size, Completeness) %>%
tt()| phylum | Size | Completeness |
|---|---|---|
| p__Bacteroidota | 4150649.86±1672281.53 | 76.62±17.72 |
| p__Pseudomonadota | 2604751.8±835336.03 | 82.68±17.96 |
| p__Verrucomicrobiota | 2538115.51±808033.83 | 81.21±14.05 |
| p__Bacillota_A | 2534693.22±882541.15 | 81.5±16.81 |
| p__Desulfobacterota | 2207699.48±163896.46 | 85.49±13.65 |
| p__Synergistota | 2141226.18±176298.47 | 75.8±16.94 |
| p__Spirochaetota | 2053326.59±36521.71 | 94.71±0.12 |
| p__Actinomycetota | 1998831.26±620954.65 | 76.63±18.44 |
| p__Bacillota_B | 1932803.47±73181.05 | 90±2.28 |
| p__Cyanobacteriota | 1877859.37±192085.35 | 92.95±17.85 |
| p__Methanobacteriota | NA | NA |
| p__Bacillota_C | 1638316.23±205183.8 | 90.94±16.02 |
| p__Bacillota | 1551655.29±517109.39 | 83.87±12.72 |
| p__Campylobacterota | NA | NA |
| p__Patescibacteria | 1042754.15±629671.65 | 77.42±17.81 |
Mags average size and completeness by phylum in wild
wild_data <-sample_metadata %>%
filter(diet=="Wild")
wild_genome <- genome_counts %>%
select(genome, all_of(wild_data$sample)) %>%
filter(rowSums(across(where(is.numeric)))!=0)
wild_genome_metadata <- genome_metadata %>%
filter(genome %in% wild_genome$genome) %>%
arrange(match(genome,wild_genome$genome))
wild_genome_metadata %>%
group_by(phylum) %>%
summarise(average_size=mean(corrected_size),sd_size=sd(corrected_size),average_comp=mean(completeness),sd_comp=sd(completeness)) %>%
mutate(Size=str_c(round(average_size,2),"±",round(sd_size,2)),
Completeness=str_c(round(average_comp,2),"±",round(sd_comp,2))) %>%
arrange(-average_size) %>%
select(phylum, Size, Completeness) %>%
tt()| phylum | Size | Completeness |
|---|---|---|
| p__Bacteroidota | 4150649.86±1672281.53 | 76.62±17.72 |
| p__Pseudomonadota | 2616167.41±843460.54 | 83.21±17.89 |
| p__Verrucomicrobiota | 2538115.51±808033.83 | 81.21±14.05 |
| p__Bacillota_A | 2537653.23±880765.5 | 81.48±16.82 |
| p__Desulfobacterota | 2207699.48±163896.46 | 85.49±13.65 |
| p__Synergistota | 2141226.18±176298.47 | 75.8±16.94 |
| p__Spirochaetota | 2053326.59±36521.71 | 94.71±0.12 |
| p__Actinomycetota | 1998831.26±620954.65 | 76.63±18.44 |
| p__Bacillota_B | 1932803.47±73181.05 | 90±2.28 |
| p__Cyanobacteriota | 1877859.37±192085.35 | 92.95±17.85 |
| p__Bacillota_C | 1638316.23±205183.8 | 90.94±16.02 |
| p__Bacillota | 1551655.29±517109.39 | 83.87±12.72 |
| p__Campylobacterota | NA | NA |
| p__Patescibacteria | 1042754.15±629671.65 | 77.42±17.81 |
wild_genome_metadata %>%
summarise(average_size=mean(corrected_size),sd_size=sd(corrected_size),average_comp=mean(completeness),sd_comp=sd(completeness)) %>%
mutate(Size=str_c(round(average_size,2),"±",round(sd_size,2)),
Completeness=str_c(round(average_comp,2),"±",round(sd_comp,2))) %>%
arrange(-average_size) %>%
select(-Size, -Completeness) %>%
tt()| average_size | sd_size | average_comp | sd_comp |
|---|---|---|---|
| 2655093 | 1210519 | 80.9823 | 16.79731 |
Mags average size and completeness by phylum in captivity
pre_data <-sample_metadata %>%
filter(diet=="Pre_grass")
pre_genome <- genome_counts %>%
select(genome, all_of(pre_data$sample)) %>%
filter(rowSums(across(where(is.numeric)))!=0)
pre_genome_metadata <- genome_metadata %>%
filter(genome %in% pre_genome$genome) %>%
arrange(match(genome,pre_genome$genome))
pre_genome_metadata %>%
group_by(phylum) %>%
summarise(average_size=mean(corrected_size),sd_size=sd(corrected_size),average_comp=mean(completeness),sd_comp=sd(completeness)) %>%
mutate(Size=str_c(round(average_size,2),"±",round(sd_size,2)),
Completeness=str_c(round(average_comp,2),"±",round(sd_comp,2))) %>%
arrange(-average_size) %>%
select(phylum, Size, Completeness) %>%
tt()| phylum | Size | Completeness |
|---|---|---|
| p__Bacteroidota | 4150649.86±1672281.53 | 76.62±17.72 |
| p__Pseudomonadota | 2605452.27±846537.48 | 82.87±18.16 |
| p__Verrucomicrobiota | 2538115.51±808033.83 | 81.21±14.05 |
| p__Bacillota_A | 2534693.22±882541.15 | 81.5±16.81 |
| p__Desulfobacterota | 2207699.48±163896.46 | 85.49±13.65 |
| p__Synergistota | 2141226.18±176298.47 | 75.8±16.94 |
| p__Spirochaetota | 2053326.59±36521.71 | 94.71±0.12 |
| p__Actinomycetota | 1998831.26±620954.65 | 76.63±18.44 |
| p__Bacillota_B | 1932803.47±73181.05 | 90±2.28 |
| p__Cyanobacteriota | 1877859.37±192085.35 | 92.95±17.85 |
| p__Methanobacteriota | NA | NA |
| p__Bacillota_C | 1638316.23±205183.8 | 90.94±16.02 |
| p__Bacillota | 1551655.29±517109.39 | 83.87±12.72 |
| p__Campylobacterota | NA | NA |
| p__Patescibacteria | 1048485.03±657316.38 | 76.17±17.99 |
pre_genome_metadata %>%
summarise(average_size=mean(corrected_size),sd_size=sd(corrected_size),average_comp=mean(completeness),sd_comp=sd(completeness)) %>%
mutate(Size=str_c(round(average_size,2),"±",round(sd_size,2)),
Completeness=str_c(round(average_comp,2),"±",round(sd_comp,2))) %>%
arrange(-average_size) %>%
select(-Size, -Completeness)# A tibble: 1 × 4
average_size sd_size average_comp sd_comp
<dbl> <dbl> <dbl> <dbl>
1 2653523. 1210343. 81.0 16.8
4.5 Genome quality
genome_metadata %>%
summarise(completeness_mean=mean(completeness) %>% round(2) %>% as.character(),
completeness_sd=sd(completeness) %>% round(2) %>% as.character(),
contamination_mean=mean(contamination) %>% round(2),
contamination_sd=sd(contamination) %>% round(2)) %>%
unite("Completeness",completeness_mean, completeness_sd, sep = " ± ", remove = TRUE) %>%
unite("Contamination",contamination_mean, contamination_sd, sep = " ± ", remove = TRUE) %>%
tt()| Completeness | Contamination |
|---|---|
| 80.99 ± 16.79 | 2.21 ± 2.32 |
#Generate quality biplot
genome_biplot <- genome_metadata %>%
select(c(genome,domain,phylum,completeness,contamination,length)) %>%
arrange(match(genome, rev(genome_tree$tip.label))) %>% #sort MAGs according to phylogenetic tree
ggplot(aes(x=completeness,y=contamination,size=length,color=phylum)) +
geom_point(alpha=0.7) +
ylim(c(10,0)) +
scale_color_manual(values=phylum_colors) +
labs(y= "Contamination", x = "Completeness") +
theme_classic() +
theme(legend.position = "none")
#Generate contamination boxplot
genome_contamination <- genome_metadata %>%
ggplot(aes(y=contamination)) +
ylim(c(10,0)) +
geom_boxplot(colour = "#999999", fill="#cccccc") +
theme_void() +
theme(legend.position = "none",
axis.title.x = element_blank(),
axis.title.y = element_blank(),
axis.text.y=element_blank(),
axis.ticks.y=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
plot.margin = unit(c(0, 0, 0.40, 0),"inches")) #add bottom-margin (top, right, bottom, left)
#Generate completeness boxplot
genome_completeness <- genome_metadata %>%
ggplot(aes(x=completeness)) +
xlim(c(50,100)) +
geom_boxplot(colour = "#999999", fill="#cccccc") +
theme_void() +
theme(legend.position = "none",
axis.title.x = element_blank(),
axis.title.y = element_blank(),
axis.text.y=element_blank(),
axis.ticks.y=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
plot.margin = unit(c(0, 0, 0, 0.50),"inches")) #add left-margin (top, right, bottom, left)
#Render composite figure
grid.arrange(grobs = list(genome_completeness,genome_biplot,genome_contamination),
layout_matrix = rbind(c(1,1,1,1,1,1,1,1,1,1,1,4),
c(2,2,2,2,2,2,2,2,2,2,2,3),
c(2,2,2,2,2,2,2,2,2,2,2,3),
c(2,2,2,2,2,2,2,2,2,2,2,3),
c(2,2,2,2,2,2,2,2,2,2,2,3),
c(2,2,2,2,2,2,2,2,2,2,2,3),
c(2,2,2,2,2,2,2,2,2,2,2,3),
c(2,2,2,2,2,2,2,2,2,2,2,3),
c(2,2,2,2,2,2,2,2,2,2,2,3),
c(2,2,2,2,2,2,2,2,2,2,2,3),
c(2,2,2,2,2,2,2,2,2,2,2,3),
c(2,2,2,2,2,2,2,2,2,2,2,3)))4.6 Functional overview
genome_annotations <- read_tsv("data/genome_annotations.tsv.xz") %>%
rename(gene=1, genome=2, contig=3)
# Predicted genes
genome_annotations %>%
nrow()[1] 1362707
# Some annotation
genome_annotations %>%
filter(if_any(c(kegg_id, pfam_hits, cazy_hits), ~ !is.na(.))) %>%
nrow()[1] 1045555
[1] 610762
# Aggregate basal GIFT into elements
genome_gifts<-genome_gifts_raw
function_table <- genome_gifts %>%
to.elements(., GIFT_db)
# Generate basal tree
function_tree <- force.ultrametric(genome_tree, method="extend") %>%
ggtree(., size = 0.3) ***************************************************************
* Note: *
* force.ultrametric does not include a formal method to *
* ultrametricize a tree & should only be used to coerce *
* a phylogeny that fails is.ultrametric due to rounding -- *
* not as a substitute for formal rate-smoothing methods. *
***************************************************************
#Add phylum colors next to the tree tips
function_tree <- gheatmap(function_tree, phylum_heatmap, offset=0, width=0.1, colnames=FALSE) +
scale_fill_manual(values=phylum_colors) +
labs(fill="Phylum")
#Reset fill scale to use a different colour profile in the heatmap
function_tree <- function_tree + new_scale_fill()
#Add functions heatmap
function_tree <- gheatmap(function_tree, function_table, offset=0.5, width=3.5, colnames=FALSE) +
vexpand(.08) +
coord_cartesian(clip = "off") +
scale_fill_gradient(low = "#f4f4f4", high = "steelblue", na.value="white") +
labs(fill="GIFT")
#Reset fill scale to use a different colour profile in the heatmap
function_tree <- function_tree + new_scale_fill()
# Add completeness barplots
function_tree <- function_tree +
geom_fruit(data=genome_metadata,
geom=geom_bar,
grid.params=list(axis="x", text.size=2, nbreak = 1),
axis.params=list(vline=TRUE),
mapping = aes(x=length, y=genome, fill=completeness),
offset = 3.8,
orientation="y",
stat="identity") +
scale_fill_gradient(low = "#cf8888", high = "#a2cc87") +
labs(fill="Genome\ncompleteness")
function_tree4.7 Functional ordination
# Generate the tSNE ordination
tSNE_function <- Rtsne(X=function_table, dims = 2, check_duplicates = FALSE)
# Plot the ordination
function_ordination <- tSNE_function$Y %>%
as.data.frame() %>%
mutate(genome=rownames(function_table)) %>%
inner_join(genome_metadata, by="genome") %>%
rename(tSNE1="V1", tSNE2="V2") %>%
select(genome,phylum,tSNE1,tSNE2, length) %>%
ggplot(aes(x = tSNE1, y = tSNE2, color = phylum, size=length))+
geom_point(shape=16, alpha=0.7) +
scale_color_manual(values=phylum_colors) +
theme_minimal() +
labs(color="Phylum", size="Genome size") +
guides(color = guide_legend(override.aes = list(size = 5))) # enlarge Phylum dots in legend
function_ordination